"""

Write Preprocessing Code Here
1. There are certain preprocessing that needs to be done here and certain that 
I can do after in the lsi code.
"""
"""count =0;
for line in open ("D:\Research\Rshabarth\data\chromium_issue_report.csv"):
    print line
    print "\n========================="
    text = line[4]
    count = count+1
    print "text is=",text
    if count <=1:
        break"""
#import "D:\\Research\\temp\\preprocess"        
from preprocess import  pp 
import logging, gensim, bz2
import MySQLdb
from gensim import corpora, similarities, models
# Configure root logging: timestamped messages at INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s'
)

# Start from an empty gensim Dictionary and print its token -> id mapping.
dictionary = corpora.Dictionary()
print(dictionary.token2id)

"""for line in  open('D:\hi.txt'):
    val =  line.lower().split()
    print val
    dictionary.add_documents(val)
 
             

print dictionary.token2id

"""
"""
def preprocess(line):
    line="hello hi"
    return line
"""

    
        #             dictionary = corpora.Dictionary(line.lower().split())             # assume there's one document per line, tokens separated by whitespace
# @Not a comment dictionary = corpora.Dictionary(pp(line).split() for line in open('D:\Research\Rshabarth\data\chromium_issue_report.csv'))             

# Build the token -> id mapping from D:\hi.txt, treating each line as one
# document: every line is passed through pp() (imported from the local
# `preprocess` module) and then split on whitespace.
# The file is opened in a `with` block so the handle is closed as soon as the
# Dictionary has consumed all lines, instead of being left for the garbage
# collector. The raw string keeps the same path value without relying on the
# unrecognised `\h` escape being passed through.
with open(r'D:\hi.txt') as corpus_file:
    dictionary = corpora.Dictionary(pp(line).split() for line in corpus_file)
print(dictionary.token2id)

# Count how many times each dictionary term occurs in D:\hi.txt.
#
# Every term known to `dictionary` is seeded with 0 first, so terms that never
# match still show up in the printed result.
freq = {}
for term_id in dictionary:
    freq[dictionary[term_id]] = 0

# Single pass over the file: each line is preprocessed and tokenised once, and
# each token that is a known term bumps that term's counter. This yields the
# same counts as re-reading the whole file once per term, without reopening
# the file for every term and without leaving file handles unclosed.
# NOTE(review): assumes pp() has no side effects, since it is now called once
# per line rather than once per (term, line) pair -- confirm in `preprocess`.
with open(r'D:\hi.txt') as corpus_file:
    for line in corpus_file:
        for word in pp(line).split():
            if word in freq:
                freq[word] += 1

print(freq)
    